{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.3 Gensim\n",
    "\n",
    "Gensim 是一个开源的python库，可以将文档表示为语义向量。\n",
    "\n",
    "官网：https://radimrehurek.com/gensim/\n",
    "\n",
    "- Word2vec\n",
    "- FastText\n",
    "- TF-IDF, LSA, LDA\n",
    "\n",
    "思考：为什么要把词表示为向量？\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "['fasttext-wiki-news-subwords-300',\n",
       " 'conceptnet-numberbatch-17-06-300',\n",
       " 'word2vec-ruscorpora-300',\n",
       " 'word2vec-google-news-300',\n",
       " 'glove-wiki-gigaword-50',\n",
       " 'glove-wiki-gigaword-100',\n",
       " 'glove-wiki-gigaword-200',\n",
       " 'glove-wiki-gigaword-300',\n",
       " 'glove-twitter-25',\n",
       " 'glove-twitter-50',\n",
       " 'glove-twitter-100',\n",
       " 'glove-twitter-200',\n",
       " '__testing_word2vec-matrix-synopsis']"
      ]
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "source": [
    "import gensim\n",
    "import gensim.downloader as api\n",
    "\n",
    "list(api.info()['models'].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "glove_vectors = api.load('glove-twitter-25')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('facebook', 0.948005199432373),\n",
       " ('tweet', 0.9403422474861145),\n",
       " ('fb', 0.9342359304428101),\n",
       " ('instagram', 0.9104823470115662),\n",
       " ('chat', 0.8964965343475342),\n",
       " ('hashtag', 0.8885936737060547),\n",
       " ('tweets', 0.8878158926963806),\n",
       " ('tl', 0.8778461813926697),\n",
       " ('link', 0.8778210878372192),\n",
       " ('internet', 0.8753897547721863)]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看与'twitter'最相近的词\n",
    "glove_vectors.most_similar('twitter')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.64005 , -0.019514,  0.70148 , -0.66123 ,  1.1723  , -0.58859 ,\n",
       "        0.25917 , -0.81541 ,  1.1708  ,  1.1413  , -0.15405 , -0.11369 ,\n",
       "       -3.8414  , -0.87233 ,  0.47489 ,  1.1541  ,  0.97678 ,  1.1107  ,\n",
       "       -0.14572 , -0.52013 , -0.52234 , -0.92349 ,  0.34651 ,  0.061939,\n",
       "       -0.57375 ], dtype=float32)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 查看'computer'的词向量\n",
    "glove_vectors['computer']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 下面使用文本中预训练的词向量进行情感分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train: 1600\n",
      "test: 400\n"
     ]
    }
   ],
   "source": [
    "from nltk.corpus import movie_reviews\n",
    "import random\n",
    "random.seed(42)\n",
    "\n",
    "\n",
    "def load_movie_reviews():\n",
    "    pos_ids = movie_reviews.fileids('pos')\n",
    "    neg_ids = movie_reviews.fileids('neg')\n",
    "\n",
    "    all_reviews = []\n",
    "    for pids in pos_ids:\n",
    "        all_reviews.append((movie_reviews.raw(pids), 'positive'))\n",
    "    \n",
    "    for nids in neg_ids:\n",
    "        all_reviews.append((movie_reviews.raw(nids), 'negative'))\n",
    "\n",
    "    random.shuffle(all_reviews)\n",
    "    train_reviews = all_reviews[:1600]\n",
    "    test_reviews = all_reviews[1600:]\n",
    "\n",
    "    return train_reviews, test_reviews\n",
    "\n",
    "train_reviews, test_reviews = load_movie_reviews()\n",
    "print('train:', len(train_reviews))\n",
    "print('test:', len(test_reviews))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk import word_tokenize\n",
    "import numpy as np\n",
    "\n",
    "# 将文本中每个词的词向量的平均作为文本的表示\n",
    "def convert_text_to_vector(text, vectors):\n",
    "    vector = np.zeros(vectors.vector_size)\n",
    "    num = 0\n",
    "    for word in word_tokenize(text):\n",
    "        if word in vectors:\n",
    "            vector = vector + vectors[word]\n",
    "            num += 1\n",
    "    if num > 0:\n",
    "        vector = vector / num\n",
    "    return vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_X_y(reviews, vectors):\n",
    "    X = []\n",
    "    Y = []\n",
    "    \n",
    "    for review, polarity in reviews:\n",
    "        x = convert_text_to_vector(review, vectors)\n",
    "        y = 0 if polarity == 'negative' else 1\n",
    "        X.append(x)\n",
    "        Y.append(y)\n",
    "\n",
    "    return X, Y\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, y_train = build_X_y(train_reviews, glove_vectors)\n",
    "X_test, y_test = build_X_y(test_reviews, glove_vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "\n",
    "def train_and_test(X_train, y_train, X_test, y_test):\n",
    "    classifier = LinearSVC()\n",
    "\n",
    "    classifier.fit(X_train, y_train)\n",
    "    accuracy = classifier.score(X_test, y_test)\n",
    "    print(f'accuracy is {accuracy:.4f}')\n",
    "\n",
    "    return classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is 0.7200\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LinearSVC()"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_and_test(X_train, y_train, X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用50维预训练的词向量\n",
    "glove_vectors_50 = KeyedVectors.load_word2vec_format('C:\\\\自然语言\\\\glove.twitter.27B.50d.word2vec.txt', binary=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, y_train = build_X_y(train_reviews, glove_vectors_50)\n",
    "X_test, y_test = build_X_y(test_reviews, glove_vectors_50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy is 0.7275\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LinearSVC()"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_and_test(X_train, y_train, X_test, y_test)"
   ]
  },
  {
   "source": [
    "思考：如何进一步的改进？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.9.0 64-bit",
   "metadata": {
    "interpreter": {
     "hash": "12c989a2272087144a907f3af46e789b70a90abf7fd5b4372cac90cccd9eaa13"
    }
   }
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}